1 Setup

options(connectionObserver = NULL)
suppressPackageStartupMessages({
  library(DESeq2)
  library(edgeR)
  library(here)
  library(ggtree)
  library(gplots)
  library(hyperSpec)
  library(LSD)
  library(org.Hs.eg.db)
  library(plotly)
  library(pvclust)
  library(RColorBrewer)
  library(SummarizedExperiment)
  library(tidyverse)
  library(treeio)
  library(tximeta)
  library(vsn)
})
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
pal <- brewer.pal(8,"Dark2")

dex and cell will be variables in our model. As they are categorical, we change them to be factors

meta <- read.delim(here("data/airway/airway_meta.txt"), 
                   header = TRUE, as.is = TRUE)
rownames(meta) <- meta$names
meta$dex <- factor(meta$dex)
meta$cell <- factor(meta$cell)

A look at it

meta
##            SampleName    cell   dex albut      names avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677

What is a factor?

meta$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: trt untrt
levels(meta$dex)
## [1] "trt"   "untrt"
as.integer(meta$dex)
## [1] 2 1 2 1 2 1 2 1

2 Alignment vs. pseudo-alignment comparison

2.1 featureCounts

The following code, we do not run, it is there to show how the featureCounts data was generated. featureCounts is a function part of the Rsubread library. It reads BAM files, generated by a “classical” aligner and summarises the overlap of the alignments with the gene annotation. I.e. for every gene locus, it counts the number of reads that aligned there. featureCounts can handle multi-mapping reads, but not in a probabilistic manner.

library(Rsubread)
fc <- featureCounts(files = files, 
                    annot.ext = gtf, 
                    isGTFAnnotationFile = TRUE,
                    GTF.featureType = "exon", 
                    GTF.attrType = "gene_id", 
                    useMetaFeatures = TRUE, 
                    strandSpecific = 0, 
                    isPairedEnd = TRUE, 
                    nthreads = 6)
Read the data generated from the command above
fc <- readRDS(here("data/airway/featureCounts/star_featureCounts.rds"))

let us inspect it

names(fc)
## [1] "counts"     "annotation" "targets"    "stat"
counts_featurecounts <- fc$counts
head(counts_featurecounts)
##                   SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000223972.5         29         28         23         16         21
## ENSG00000227232.5        843        701       1102        495        838
## ENSG00000278267.1         26         19         24         16         13
## ENSG00000243485.5         20          9         30          9         10
## ENSG00000284332.1          0          0          0          0          0
## ENSG00000237613.2         19          6         19          4          8
##                   SRR1039517 SRR1039520 SRR1039521
## ENSG00000223972.5         30          5          2
## ENSG00000227232.5        967        578        573
## ENSG00000278267.1         20         12         16
## ENSG00000243485.5          4          9          3
## ENSG00000284332.1          0          0          0
## ENSG00000237613.2          6         14          7
dim(counts_featurecounts)
## [1] 58721     8
fc$stat
##                           Status SRR1039508 SRR1039509 SRR1039512 SRR1039513
## 1                       Assigned   21007653   19184955   25895949   15496792
## 2            Unassigned_Unmapped          0          0          0          0
## 3      Unassigned_MappingQuality          0          0          0          0
## 4             Unassigned_Chimera          0          0          0          0
## 5      Unassigned_FragmentLength          0          0          0          0
## 6           Unassigned_Duplicate          0          0          0          0
## 7        Unassigned_MultiMapping          0          0          0          0
## 8           Unassigned_Secondary          0          0          0          0
## 9            Unassigned_NonSplit          0          0          0          0
## 10         Unassigned_NoFeatures    1922304    1410877    2030626    1083972
## 11 Unassigned_Overlapping_Length          0          0          0          0
## 12          Unassigned_Ambiguity    1358418    1236366    1584019     943994
##    SRR1039516 SRR1039517 SRR1039520 SRR1039521
## 1    24998626   31477332   19500283   21608001
## 2           0          0          0          0
## 3           0          0          0          0
## 4           0          0          0          0
## 5           0          0          0          0
## 6           0          0          0          0
## 7           0          0          0          0
## 8           0          0          0          0
## 9           0          0          0          0
## 10    2133471    2509956    1539435    1791941
## 11          0          0          0          0
## 12    1556731    1927932    1186465    1283094

let us visualise it

stats <- as.matrix(fc$stat %>% column_to_rownames("Status"))
barplot(stats[rowSums(stats)>0,],
        beside = TRUE,col = pal[1:3],las=2,
        cex.axis = .8,cex.names = .8,
        legend.text = rownames(stats)[rowSums(stats)>0],
        args.legend = list(horiz=TRUE,cex=.8,x="top",bty="n"))

2.2 Salmon

We list all quant.sf output files from Salmon

salmonfiles <- here("data/airway/salmon/", 
                      meta$names, "/quant.sf")
names(salmonfiles) <- meta$names
stopifnot(all(file.exists(salmonfiles)))

We extend the metadata by adding a column “files” to the metadata table. This table must contain at least two columns: “names” and “files” for importing the data using the tximeta package

coldata <- cbind(meta, files = salmonfiles, 
                 stringsAsFactors = FALSE)

How does it look

coldata
##            SampleName    cell   dex albut      names avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
##                                                                          files
## SRR1039508 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039508//quant.sf
## SRR1039509 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039509//quant.sf
## SRR1039512 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039512//quant.sf
## SRR1039513 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039513//quant.sf
## SRR1039516 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039516//quant.sf
## SRR1039517 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039517//quant.sf
## SRR1039520 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039520//quant.sf
## SRR1039521 /home/Nicolas/BioQAExample/data/airway/salmon//SRR1039521//quant.sf

Now, we can import the quantifications

Remember that Salmon quantifies abundances at the transcript level

This function will fetch from the Gencode archive (locate at the European Bioinformatics Institue), the annotation relevant to our project, including the transcript information and that of their parental gene

Note: enter 2 when prompted

st <- tximeta::tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2021-06-10 22:59:18
## Loading required package: GenomicFeatures
## loading existing transcript ranges created: 2021-06-10 22:59:21

let us take a look at it

st
## class: RangedSummarizedExperiment 
## dim: 205870 8 
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(205870): ENST00000456328.2 ENST00000450305.2 ...
##   ENST00000387460.2 ENST00000387461.2
## rowData names(3): tx_id gene_id tx_name
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
metadata(st)
## $tximetaInfo
## $tximetaInfo$version
## [1] '1.8.4'
## 
## $tximetaInfo$type
## [1] "salmon"
## 
## $tximetaInfo$importTime
## [1] "2021-06-13 12:17:41 CEST"
## 
## 
## $quantInfo
## $quantInfo$salmon_version
## [1] "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0" "0.12.0"
## 
## $quantInfo$samp_type
## [1] "none" "none" "none" "none" "none" "none" "none" "none"
## 
## $quantInfo$opt_type
## [1] "vb" "vb" "vb" "vb" "vb" "vb" "vb" "vb"
## 
## $quantInfo$quant_errors
## $quantInfo$quant_errors[[1]]
## list()
## 
## $quantInfo$quant_errors[[2]]
## list()
## 
## $quantInfo$quant_errors[[3]]
## list()
## 
## $quantInfo$quant_errors[[4]]
## list()
## 
## $quantInfo$quant_errors[[5]]
## list()
## 
## $quantInfo$quant_errors[[6]]
## list()
## 
## $quantInfo$quant_errors[[7]]
## list()
## 
## $quantInfo$quant_errors[[8]]
## list()
## 
## 
## $quantInfo$num_libraries
## [1] 1 1 1 1 1 1 1 1
## 
## $quantInfo$library_types
## [1] "IU" "IU" "IU" "IU" "IU" "IU" "IU" "IU"
## 
## $quantInfo$frag_dist_length
## [1] 1001 1001 1001 1001 1001 1001 1001 1001
## 
## $quantInfo$seq_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 
## $quantInfo$gc_bias_correct
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 
## $quantInfo$num_bias_bins
## [1] 4096 4096 4096 4096 4096 4096 4096 4096
## 
## $quantInfo$mapping_type
## [1] "mapping" "mapping" "mapping" "mapping" "mapping" "mapping" "mapping"
## [8] "mapping"
## 
## $quantInfo$num_targets
## [1] 205870 205870 205870 205870 205870 205870 205870 205870
## 
## $quantInfo$serialized_eq_classes
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 
## $quantInfo$length_classes
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]
## [1,]    520    520    520    520    520    520    520    520
## [2,]    669    669    669    669    669    669    669    669
## [3,]   1065   1065   1065   1065   1065   1065   1065   1065
## [4,]   2328   2328   2328   2328   2328   2328   2328   2328
## [5,] 205012 205012 205012 205012 205012 205012 205012 205012
## 
## $quantInfo$index_seq_hash
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
## 
## $quantInfo$index_name_hash
## [1] "77aca5545a0626421efb4730dd7b95482c77da261f9bdef70d36e25ee68bb7ef"
## 
## $quantInfo$index_seq_hash512
## [1] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [2] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [3] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [4] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [5] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [6] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [7] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## [8] "f37ae2a7412efd8518d68c22fc3fcc2478b59833809382abd5d9055475505e516730c70914343af34c9232af92fe22832e70afde6b38c26381097068e1e82551"
## 
## $quantInfo$index_name_hash512
## [1] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [2] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [3] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [4] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [5] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [6] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [7] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## [8] "67f286e5ec2895c10aa6e3a8feed04cb3a80747de7b3afb620298078481b303d2f979407fc104d4f3e8af0e82be2acb2f294ee5f2f68c00087f2855120d9f4ed"
## 
## $quantInfo$num_bootstraps
## [1] 0 0 0 0 0 0 0 0
## 
## $quantInfo$num_processed
## [1] 22935521 21155707 28136282 16823088 27298970 34298260 21275888 23487860
## 
## $quantInfo$num_mapped
## [1] 21072103 19264813 26088219 15654204 25210669 31839924 19643454 21784047
## 
## $quantInfo$percent_mapped
## [1] 91.87541 91.06201 92.72092 93.05191 92.35026 92.83248 92.32730 92.74598
## 
## $quantInfo$call
## [1] "quant" "quant" "quant" "quant" "quant" "quant" "quant" "quant"
## 
## $quantInfo$start_time
## [1] "Thu Mar 21 21:33:45 2019" "Thu Mar 21 21:35:50 2019"
## [3] "Thu Mar 21 21:37:52 2019" "Thu Mar 21 21:40:11 2019"
## [5] "Thu Mar 21 21:41:52 2019" "Thu Mar 21 21:44:04 2019"
## [7] "Thu Mar 21 21:46:38 2019" "Thu Mar 21 21:48:32 2019"
## 
## $quantInfo$end_time
## [1] "Thu Mar 21 21:35:49 2019" "Thu Mar 21 21:37:51 2019"
## [3] "Thu Mar 21 21:40:10 2019" "Thu Mar 21 21:41:51 2019"
## [5] "Thu Mar 21 21:44:02 2019" "Thu Mar 21 21:46:37 2019"
## [7] "Thu Mar 21 21:48:31 2019" "Thu Mar 21 21:50:28 2019"
## 
## 
## $countsFromAbundance
## [1] "no"
## 
## $level
## [1] "txp"
## 
## $txomeInfo
## $txomeInfo$source
## [1] "GENCODE"
## 
## $txomeInfo$organism
## [1] "Homo sapiens"
## 
## $txomeInfo$release
## [1] "29"
## 
## $txomeInfo$genome
## [1] "GRCh38"
## 
## $txomeInfo$fasta
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz"
## 
## $txomeInfo$gtf
## [1] "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
## 
## $txomeInfo$sha256
## [1] "40849ed828ea7d6a94af54a5c40e5d87eb0ce0fc1e9513208a5cffe59d442292"
## 
## $txomeInfo$linkedTxome
## [1] FALSE
## 
## 
## $txdbInfo
##                                                                                            Db type 
##                                                                                             "TxDb" 
##                                                                                 Supporting package 
##                                                                                  "GenomicFeatures" 
##                                                                                        Data source 
## "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz" 
##                                                                                           Organism 
##                                                                                     "Homo sapiens" 
##                                                                                        Taxonomy ID 
##                                                                                             "9606" 
##                                                                                   miRBase build ID 
##                                                                                                 NA 
##                                                                                             Genome 
##                                                                                             "hg38" 
##                                                                                    transcript_nrow 
##                                                                                           "206694" 
##                                                                                          exon_nrow 
##                                                                                           "702834" 
##                                                                                           cds_nrow 
##                                                                                           "274013" 
##                                                                                      Db created by 
##                                                        "GenomicFeatures package from Bioconductor" 
##                                                                                      Creation time 
##                                                     "2019-10-07 10:00:06 -0400 (Mon, 07 Oct 2019)" 
##                                                           GenomicFeatures version at creation time 
##                                                                                           "1.36.4" 
##                                                                   RSQLite version at creation time 
##                                                                                            "2.1.2" 
##                                                                                    DBSCHEMAVERSION 
##                                                                                              "1.2"
colData(st)
## DataFrame with 8 rows and 9 columns
##             SampleName     cell      dex       albut       names avgLength
##            <character> <factor> <factor> <character> <character> <integer>
## SRR1039508  GSM1275862  N61311     untrt       untrt  SRR1039508       126
## SRR1039509  GSM1275863  N61311     trt         untrt  SRR1039509       126
## SRR1039512  GSM1275866  N052611    untrt       untrt  SRR1039512       126
## SRR1039513  GSM1275867  N052611    trt         untrt  SRR1039513        87
## SRR1039516  GSM1275870  N080611    untrt       untrt  SRR1039516       120
## SRR1039517  GSM1275871  N080611    trt         untrt  SRR1039517       126
## SRR1039520  GSM1275874  N061011    untrt       untrt  SRR1039520       101
## SRR1039521  GSM1275875  N061011    trt         untrt  SRR1039521        98
##             Experiment      Sample    BioSample
##            <character> <character>  <character>
## SRR1039508   SRX384345   SRS508568 SAMN02422669
## SRR1039509   SRX384346   SRS508567 SAMN02422675
## SRR1039512   SRX384349   SRS508571 SAMN02422678
## SRR1039513   SRX384350   SRS508572 SAMN02422670
## SRR1039516   SRX384353   SRS508575 SAMN02422682
## SRR1039517   SRX384354   SRS508576 SAMN02422673
## SRR1039520   SRX384357   SRS508579 SAMN02422683
## SRR1039521   SRX384358   SRS508580 SAMN02422677
rowData(st)
## DataFrame with 205870 rows and 3 columns
##                       tx_id           gene_id           tx_name
##                   <integer>   <CharacterList>       <character>
## ENST00000456328.2         1 ENSG00000223972.5 ENST00000456328.2
## ENST00000450305.2         2 ENSG00000223972.5 ENST00000450305.2
## ENST00000488147.1      9483 ENSG00000227232.5 ENST00000488147.1
## ENST00000619216.1      9484 ENSG00000278267.1 ENST00000619216.1
## ENST00000473358.1         3 ENSG00000243485.5 ENST00000473358.1
## ...                     ...               ...               ...
## ENST00000361681.2    206692 ENSG00000198695.2 ENST00000361681.2
## ENST00000387459.1    206693 ENSG00000210194.1 ENST00000387459.1
## ENST00000361789.2    206684 ENSG00000198727.2 ENST00000361789.2
## ENST00000387460.2    206685 ENSG00000210195.2 ENST00000387460.2
## ENST00000387461.2    206694 ENSG00000210196.2 ENST00000387461.2
assays(st) #plural
## List of length 3
## names(3): counts abundance length
head(assay(st)) # the first one by default
##                   SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENST00000456328.2     17.896     16.714     12.576      8.970     19.224
## ENST00000450305.2      0.000      0.000      0.000      0.000      0.000
## ENST00000488147.1     66.727     75.392    188.265     43.748     94.695
## ENST00000619216.1      0.000      0.000      0.000      0.000      0.000
## ENST00000473358.1      0.000      0.000      0.000      0.000      0.000
## ENST00000469289.1      0.000      0.000      0.000      0.000      0.000
##                   SRR1039517 SRR1039520 SRR1039521
## ENST00000456328.2      0.000      0.000      0.000
## ENST00000450305.2      0.000      0.000      0.000
## ENST00000488147.1     92.538     57.415     64.596
## ENST00000619216.1      0.000      0.000      1.000
## ENST00000473358.1      0.000      0.000      0.000
## ENST00000469289.1      0.000      1.059      0.000

However, we usually want to (start) analyse(ing) the data at the gene level

So we summarize the quantifications on the gene level.

sg <- tximeta::summarizeToGene(st)
## loading existing TxDb created: 2021-06-10 22:59:18
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2021-06-10 23:00:36
## summarizing abundance
## summarizing counts
## summarizing length

Did it work? how many transcripts?

dim(assay(st))
## [1] 205870      8

how many genes?

dim(assay(sg))
## [1] 58294     8

And, how does it look?

head(assay(sg))
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14    707.210    463.445    896.252    421.186   1185.870
## ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
## ENSG00000000419.12    454.000    508.999    606.000    352.999    583.000
## ENSG00000000457.13    311.880    266.807    358.017    217.296    354.140
## ENSG00000000460.16     91.451     74.463     52.361     45.144    107.779
## ENSG00000000938.12      0.000      0.000      2.000      0.000      1.000
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   1086.289    802.000    596.220
## ENSG00000000005.5       0.000      0.000      0.000
## ENSG00000000419.12    773.000    410.000    500.001
## ENSG00000000457.13    423.712    297.742    288.368
## ENSG00000000460.16    100.610     97.742     84.634
## ENSG00000000938.12      0.000      0.000      0.000

Looks fine, but given that we have the gene annotation available, let us add gene symbols

sg <- tximeta::addIds(sg, "SYMBOL", gene = TRUE)
## mapping to new IDs using org.Hs.eg.db
## if all matching IDs are desired, and '1:many mappings' are reported,
## set multiVals='list' to obtain all the matching IDs
## 'select()' returned 1:many mapping between keys and columns
head(rowData(sg))
## DataFrame with 6 rows and 3 columns
##                               gene_id
##                           <character>
## ENSG00000000003.14 ENSG00000000003.14
## ENSG00000000005.5   ENSG00000000005.5
## ENSG00000000419.12 ENSG00000000419.12
## ENSG00000000457.13 ENSG00000000457.13
## ENSG00000000460.16 ENSG00000000460.16
## ENSG00000000938.12 ENSG00000000938.12
##                                                                        tx_ids
##                                                               <CharacterList>
## ENSG00000000003.14  ENST00000612152.4,ENST00000373020.8,ENST00000614008.4,...
## ENSG00000000005.5                         ENST00000373031.4,ENST00000485971.1
## ENSG00000000419.12  ENST00000371588.9,ENST00000466152.5,ENST00000371582.8,...
## ENSG00000000457.13 ENST00000367771.10,ENST00000367770.5,ENST00000367772.8,...
## ENSG00000000460.16  ENST00000498289.5,ENST00000472795.5,ENST00000496973.5,...
## ENSG00000000938.12  ENST00000374005.7,ENST00000399173.5,ENST00000374004.5,...
##                         SYMBOL
##                    <character>
## ENSG00000000003.14      TSPAN6
## ENSG00000000005.5         TNMD
## ENSG00000000419.12        DPM1
## ENSG00000000457.13       SCYL3
## ENSG00000000460.16    C1orf112
## ENSG00000000938.12         FGR

Because the tools (DESeq2 and edgeR), we use downstream expect integers (non decimal counts), we round the values. This has no impact on the analysis

counts_salmon <- round(assay(sg, "counts"))

2.3 Comparison

Let’s compare featureCounts vs. Salmon for one sample

spl <- "SRR1039508"

for comparing, we need to make sure, both counts table are sorted in the same way. We create a data.frame containing both sorted according to their gene IDs

gns <- rownames(counts_salmon)
quants <- data.frame(featureCounts = counts_featurecounts[gns, spl],
                     salmon = counts_salmon[gns, spl])

head(quants)
##                    featureCounts salmon
## ENSG00000000003.14           702    707
## ENSG00000000005.5              0      0
## ENSG00000000419.12           467    454
## ENSG00000000457.13           279    312
## ENSG00000000460.16            62     91
## ENSG00000000938.12             0      0

Let us plot the raw counts

heatscatter(quants$featureCounts,quants$salmon,
            xlab="featureCounts",ylab="salmon",
            main="scatterplot (raw counts)")
abline(0,1,lty=2,col="black")

Let us go onto a log scale

heatscatter(log10(quants$featureCounts+1),log10(quants$salmon+1),
            xlab="featureCounts",ylab="salmon",
            main="scatterplot (log10 counts + 1)")
abline(0,1,lty=2,col="black")

heatscatter(log1p(quants$featureCounts),log1p(quants$salmon),
            xlab="featureCounts",ylab="salmon",
            main="scatterplot (log counts + 1)")
abline(0,1,lty=2,col="black")

3 Comparison after normalisation

We will use two commonly used tools to achieve the same:

DESeq2 and edgeR

Why? Because you are likely to encounter both in the literature

Differences? Little - mostly taste. DESeq2 abstracts a lot of the decisions from the user, while edgeR leave these all to the user.

Let us take a look at the information we have about our samples to build our model

colData(sg)
## DataFrame with 8 rows and 9 columns
##             SampleName     cell      dex       albut       names avgLength
##            <character> <factor> <factor> <character> <character> <integer>
## SRR1039508  GSM1275862  N61311     untrt       untrt  SRR1039508       126
## SRR1039509  GSM1275863  N61311     trt         untrt  SRR1039509       126
## SRR1039512  GSM1275866  N052611    untrt       untrt  SRR1039512       126
## SRR1039513  GSM1275867  N052611    trt         untrt  SRR1039513        87
## SRR1039516  GSM1275870  N080611    untrt       untrt  SRR1039516       120
## SRR1039517  GSM1275871  N080611    trt         untrt  SRR1039517       126
## SRR1039520  GSM1275874  N061011    untrt       untrt  SRR1039520       101
## SRR1039521  GSM1275875  N061011    trt         untrt  SRR1039521        98
##             Experiment      Sample    BioSample
##            <character> <character>  <character>
## SRR1039508   SRX384345   SRS508568 SAMN02422669
## SRR1039509   SRX384346   SRS508567 SAMN02422675
## SRR1039512   SRX384349   SRS508571 SAMN02422678
## SRR1039513   SRX384350   SRS508572 SAMN02422670
## SRR1039516   SRX384353   SRS508575 SAMN02422682
## SRR1039517   SRX384354   SRS508576 SAMN02422673
## SRR1039520   SRX384357   SRS508579 SAMN02422683
## SRR1039521   SRX384358   SRS508580 SAMN02422677

cell and dex are 2 variables where we have different values

Based on the original publication from this data, we know that they are relevant

colData(sg)$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: trt untrt
colData(sg)$cell 
## [1] N61311  N61311  N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311

Both are factors (categorical variables), but for dex, the treatment comes before the control and this implies that the treatment would be the reference we compare to. It would be easier to think the other way around, so we change the order of the level

colData(sg)$dex <- relevel(colData(sg)$dex, ref = "untrt")
colData(sg)$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: untrt trt

3.1 DESeq2

Let us create the DESeq objects

ds_se <- DESeqDataSet(sg, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta

some sanity checking

stopifnot(all(colnames(counts_salmon) == rownames(meta)))

Here is another way to create the same object, relevant if your organism is not among the ones available at Gencode.

meta$dex <- relevel(meta$dex, ref = "untrt")
ds_matrix <- DESeqDataSetFromMatrix(countData = counts_salmon, 
                                    colData = meta,
                                    design = ~ cell + dex)
## converting counts to integer mode

3.2 edgeR

Create the object

genetable <- data.frame(gene.id = rownames(counts_salmon),
                        stringsAsFactors = FALSE)
stopifnot(all(rownames(meta) == colnames(counts_salmon)))
dge <- DGEList(counts = counts_salmon, 
               samples = meta, 
               genes = genetable)
names(dge)
## [1] "counts"  "samples" "genes"

Here is an illustration of what I meant earlier.

edgeR leaves a lot for the user to do

Here we retrieve the average transcript length, use it to correct the salmon expression values, before calculating the library size factor (i.e. the difference in sequencing depth between samples).

avetxlengths <- assay(sg, "length")
stopifnot(all(rownames(avetxlengths) == rownames(counts_salmon)))
stopifnot(all(colnames(avetxlengths) == colnames(counts_salmon)))
avetxlengths <- avetxlengths/exp(rowMeans(log(avetxlengths)))
offsets <- log(calcNormFactors(counts_salmon/avetxlengths)) + 
  log(colSums(counts_salmon/avetxlengths))
dge <- scaleOffset(dge, t(t(log(avetxlengths)) + offsets))
names(dge)
## [1] "counts"  "samples" "genes"   "offset"
dge <- edgeR::calcNormFactors(dge)
## Warning in calcNormFactors.DGEList(dge): object contains offsets, which take precedence over library
## sizes and norm factors (and which will not be recomputed).
dge$samples
##            group lib.size norm.factors SampleName    cell   dex albut
## SRR1039508     1 21071980    1.0560801 GSM1275862  N61311 untrt untrt
## SRR1039509     1 19264691    1.0298325 GSM1275863  N61311   trt untrt
## SRR1039512     1 26088082    0.9857234 GSM1275866 N052611 untrt untrt
## SRR1039513     1 15654069    0.9485718 GSM1275867 N052611   trt untrt
## SRR1039516     1 25210569    1.0297904 GSM1275870 N080611 untrt untrt
## SRR1039517     1 31839813    0.9647851 GSM1275871 N080611   trt untrt
## SRR1039520     1 19643330    1.0345312 GSM1275874 N061011 untrt untrt
## SRR1039521     1 21783926    0.9567274 GSM1275875 N061011   trt untrt
##                 names avgLength Experiment    Sample    BioSample
## SRR1039508 SRR1039508       126  SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRR1039509       126  SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRR1039512       126  SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRR1039513        87  SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRR1039516       120  SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRR1039517       126  SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRR1039520       101  SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRR1039521        98  SRX384358 SRS508580 SAMN02422677

Let us visualise the library size factor differences

It is surprisingly similar across all samples, a +/- 5% difference (we are using a toy dataset obviously)

boxplot(dge$samples$norm.factors)

3.3 Comparison

Neither DESeq2 nor edgeR modify the count data. They calculate the parameters (such as the library size factor, the dispersion, etc.) to use in their model. If we want to visualise the data in a reduced dimensionality (not 50+ thousands genes and x samples as a matrix), such as a PCA (principal component analysis), or MDS (Multi-dimensional scaling), we need to normalise the data.

In addition to a difference in sequencing depth between samples, RNA-Seq data suffers from another problem, a mean-variance relationship, i.e. the data is heteroskedastic.

meanSdPlot(log(assay(ds_se)[rowSums(assay(ds_se))>0,]))
## Warning: Removed 16307 rows containing non-finite values (stat_binhex).

To counteract this, one can use a variance stabilising transformation (VST), a heuristic that will transform the variance so it becomes independent of the mean

vsd <- DESeq2::vst(ds_se)
## using 'avgTxLength' from assays(dds), correcting for library size

Et voila! Not perfect but good enough for visualisation

meanSdPlot(log(assay(vsd)[rowSums(assay(vsd))>0,]))

4 Visualisation

Finally the Biological QA ## DESeq2

DESeq2::plotPCA(vsd, intgroup = "cell")

DESeq2::plotPCA(vsd, intgroup = "dex")

4.1 edgeR

plotMDS(dge, top = 500, 
        labels = NULL, 
        col = as.numeric(dge$samples$dex), 
        pch = as.numeric(dge$samples$cell), 
        cex = 2, gene.selection = "common")

4.2 my own take at it

vst <- assay(vsd)
vst <- vst - min(vst)
meanSdPlot(vst[rowSums(vst)>0,])

4.2.1 QC on the normalised data

4.2.1.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

And the number of possible combinations

nlevel=nlevels(ds_se$dex) * nlevels(ds_se$cell)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

4.2.2 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(ds_se)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=dex,shape=cell,text=SampleName)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

4.2.3 Sequencing depth

Number of genes expressed per condition at different cutoffs

conds <- ds_se$dex
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

4.2.4 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3,scale=TRUE)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 9 y values <= 0 omitted from
## logarithmic plot

cutoff.pos <- 2
  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vst[sels[[cutoff.pos]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="")

4.2.5 Hierarchical clustering

Done to assess the previous dendrogram’s reproducibility

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[cutoff.pos]],]))),
                      method.hclust = "ward.D2", 
                      nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 63 nodes on host 'localhost'
## Multiscale bootstrap... Done.

plot the clustering with bp and au

plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)

Somewhat fancier

tree2 <- full_join(as.treedata(hm.pvclust),
                   coldata %>% rename(label=names), by='label')
## Joining, by = "label"
ggtree(tree2) + 
  geom_tippoint(aes(shape=cell)) +
  geom_tiplab(size=3,aes(label=dex),hjust=-0.5) +
  geom_nodelab(aes(label=au,color=au),hjust=-0.5,size=3) + 
  scale_color_continuous(low="red", high="blue")

bootstrapping results as a table

print(hm.pvclust, digits=3)
## 
## Cluster method: ward.D2
## Distance      : correlation
## 
## Estimates on edges:
## 
##      si    au    bp se.si se.au se.bp      v      c  pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 2 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 3 0.972 0.986 0.987 0.012 0.007 0.001 -2.214 -0.018 0.969
## 4 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 7 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000

5 Session Info

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] GenomicFeatures_1.42.3      vsn_3.58.0                 
##  [3] tximeta_1.8.4               treeio_1.14.4              
##  [5] forcats_0.5.1               stringr_1.4.0              
##  [7] dplyr_1.0.5                 purrr_0.3.4                
##  [9] readr_1.4.0                 tidyr_1.1.3                
## [11] tibble_3.1.0                tidyverse_1.3.0            
## [13] RColorBrewer_1.1-2          pvclust_2.2-0              
## [15] plotly_4.9.3                org.Hs.eg.db_3.12.0        
## [17] AnnotationDbi_1.52.0        LSD_4.1-0                  
## [19] hyperSpec_0.99-20201127     xml2_1.3.2                 
## [21] ggplot2_3.3.3               lattice_0.20-41            
## [23] gplots_3.1.1                ggtree_2.4.2               
## [25] here_1.0.1                  edgeR_3.32.1               
## [27] limma_3.46.0                DESeq2_1.30.1              
## [29] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [31] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [33] GenomicRanges_1.42.0        GenomeInfoDb_1.26.7        
## [35] IRanges_2.24.1              S4Vectors_0.28.1           
## [37] BiocGenerics_0.36.0        
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                  backports_1.2.1              
##   [3] AnnotationHub_2.22.0          BiocFileCache_1.14.0         
##   [5] lazyeval_0.2.2                splines_4.0.5                
##   [7] crosstalk_1.1.1               BiocParallel_1.24.1          
##   [9] digest_0.6.27                 ensembldb_2.14.0             
##  [11] htmltools_0.5.1.1             fansi_0.4.2                  
##  [13] magrittr_2.0.1                memoise_2.0.0                
##  [15] Biostrings_2.58.0             annotate_1.68.0              
##  [17] modelr_0.1.8                  askpass_1.1                  
##  [19] prettyunits_1.1.1             jpeg_0.1-8.1                 
##  [21] colorspace_2.0-0              rappdirs_0.3.3               
##  [23] blob_1.2.1                    rvest_1.0.0                  
##  [25] haven_2.3.1                   xfun_0.22                    
##  [27] hexbin_1.28.2                 tximport_1.18.0              
##  [29] crayon_1.4.1                  RCurl_1.98-1.3               
##  [31] jsonlite_1.7.2                genefilter_1.72.1            
##  [33] survival_3.2-10               ape_5.5                      
##  [35] glue_1.4.2                    gtable_0.3.0                 
##  [37] zlibbioc_1.36.0               XVector_0.30.0               
##  [39] DelayedArray_0.16.3           scales_1.1.1                 
##  [41] DBI_1.1.1                     Rcpp_1.0.6                   
##  [43] progress_1.2.2                viridisLite_0.3.0            
##  [45] xtable_1.8-4                  tidytree_0.3.4               
##  [47] bit_4.0.4                     preprocessCore_1.52.1        
##  [49] htmlwidgets_1.5.3             httr_1.4.2                   
##  [51] ellipsis_0.3.1                farver_2.1.0                 
##  [53] pkgconfig_2.0.3               XML_3.99-0.6                 
##  [55] sass_0.3.1                    dbplyr_2.1.1                 
##  [57] locfit_1.5-9.4                utf8_1.2.1                   
##  [59] labeling_0.4.2                later_1.1.0.1                
##  [61] tidyselect_1.1.0              rlang_0.4.10                 
##  [63] munsell_0.5.0                 BiocVersion_3.12.0           
##  [65] cellranger_1.1.0              tools_4.0.5                  
##  [67] cachem_1.0.4                  cli_2.4.0                    
##  [69] generics_0.1.0                RSQLite_2.2.6                
##  [71] broom_0.7.6                   evaluate_0.14                
##  [73] fastmap_1.1.0                 yaml_2.2.1                   
##  [75] knitr_1.31                    bit64_4.0.5                  
##  [77] fs_1.5.0                      caTools_1.18.2               
##  [79] AnnotationFilter_1.14.0       nlme_3.1-152                 
##  [81] mime_0.10                     aplot_0.0.6                  
##  [83] biomaRt_2.46.3                compiler_4.0.5               
##  [85] rstudioapi_0.13               interactiveDisplayBase_1.28.0
##  [87] curl_4.3                      png_0.1-7                    
##  [89] affyio_1.60.0                 testthat_3.0.2               
##  [91] reprex_2.0.0                  geneplotter_1.68.0           
##  [93] bslib_0.2.4                   stringi_1.5.3                
##  [95] highr_0.8                     ProtGenerics_1.22.0          
##  [97] Matrix_1.3-2                  vctrs_0.3.7                  
##  [99] pillar_1.5.1                  lifecycle_1.0.0              
## [101] BiocManager_1.30.12           jquerylib_0.1.3              
## [103] data.table_1.14.0             bitops_1.0-6                 
## [105] rtracklayer_1.50.0            httpuv_1.5.5                 
## [107] patchwork_1.1.1               affy_1.68.0                  
## [109] R6_2.5.0                      latticeExtra_0.6-29          
## [111] promises_1.2.0.1              KernSmooth_2.23-18           
## [113] gtools_3.8.2                  assertthat_0.2.1             
## [115] openssl_1.4.3                 rprojroot_2.0.2              
## [117] withr_2.4.1                   GenomicAlignments_1.26.0     
## [119] Rsamtools_2.6.0               GenomeInfoDbData_1.2.4       
## [121] hms_1.0.0                     rmarkdown_2.7                
## [123] rvcheck_0.1.8                 shiny_1.6.0                  
## [125] lubridate_1.7.10